Try to compare different algorithms on pure single cell results. Both the 10x and C1 SmartSeq sections are using the 81,881 entry ENCODE4 bulk annotation set.
Cell ranger and STAR Solo only produce Gene level count quantifications. When processing 10x data kallisto and salmon alevin produce gene counts as well.
When processing C1 data, Kallisto & Salmon only produce transcript quantifications, so for C1 gene level comparisons, transcript level quantifications are created by grouping all the transcripts by their gene id and summing those groups.
This notebook reuses the correlation method developed for the ENCODE3 Bulk RNA pipeline
My implementation replicate_scores.
Original R version MAD.R
import numpy
import scanpy
import anndata
import os
import pandas
from matplotlib import pyplot
from matplotlib import cm
import scipy
from pathlib import Path
import upsetplot
import louvain
import sklearn
import seaborn
import logging
from pipeline_common import get_gene_id_to_name, load_gtf
#from cuml.decomposition import PCA
#from cuml.manifold import TSNE
logger = logging.getLogger('notebook')
analysis_dir = Path('ENCSR874BOF_e10_5_limb')
scanpy.__version__
cellr_common = scanpy.read_h5ad(analysis_dir / 'cellranger_filtered.sparse.h5ad')
cellr_common
solo_common = scanpy.read_h5ad(analysis_dir / 'solo_filtered.sparse.h5ad')
solo_common
alevin_common = scanpy.read_h5ad(analysis_dir / 'alevin_filtered.h5ad')
alevin_common
kallisto_common = scanpy.read_h5ad(analysis_dir / 'kallisto_filtered.h5ad')
kallisto_common
def replicate_scores(rep1, rep2, rep1_name, rep2_name, Acutoff=0):
"""Compute correlations, MAD, and SD replicate comparison scores
Parameters
----------
table: pandas.DataFrame
DataFrame of RSEM quantifications
rep1_name: str
name of one column to score (library name)
rep2_name: str
name of the other column to score (library name)
Acutoff: float
Require that the average of the two replicates is greater
than this cutoff in log2 space
Returns
-------
pandas.Series
Containing all the scores between the two replicates
"""
#rep1 = table[rep1_name]
#rep2 = table[rep2_name]
eitherzero = (rep1 == 0) | (rep2 == 0)
replz1 = numpy.log2(rep1[eitherzero != True])
replz2 = numpy.log2(rep2[eitherzero != True])
M = replz1 - replz2
A = (replz1 + replz2) / 2.0
replz1_gt_Acutoff = replz1[A > Acutoff]
replz2_gt_Acutoff = replz2[A > Acutoff]
skip_rafa = False
if len(replz1_gt_Acutoff) < 100:
#logger.warning("No data survived Acutoff filter in %s", rep1_name)
skip_rafa = True
if len(replz2_gt_Acutoff) < 100:
#logger.warning("No data survived Acutoff filter in %s", rep2_name)
skip_rafa = True
if not skip_rafa:
rafa_pearson = scipy.stats.pearsonr(replz1_gt_Acutoff,
replz2_gt_Acutoff)[0]
rafa_spearman= scipy.stats.spearmanr(replz1_gt_Acutoff,
replz2_gt_Acutoff)[0]
else:
rafa_pearson = numpy.nan
rafa_spearman = numpy.nan
scores = pandas.Series({
'total_rows': len(rep1),
'passed_filter': len(replz1[A > Acutoff]),
'naive_pearson': scipy.stats.pearsonr(rep1, rep2)[0],
'naive_spearman': scipy.stats.spearmanr(rep1, rep2)[0],
'rafa_pearson': rafa_pearson,
'rafa_spearman': rafa_spearman,
'MAD': numpy.round(numpy.median(numpy.abs(M)[A > Acutoff]) * 1.4826, 3),
'SD': numpy.round(numpy.sqrt(numpy.mean(M[A > Acutoff] ** 2)), 3)
},
index=['total_rows', 'passed_filter',
'naive_pearson', 'naive_spearman',
'rafa_pearson', 'rafa_spearman',
'MAD', 'SD']
)
return scores
dense_10x = {
'cellranger': cellr_common.to_df().T,
'solo': solo_common.to_df().T,
'alevin': alevin_common.to_df().T,
'kallisto': kallisto_common.to_df().T,
#'kallisto EM': numpy.array(kallisto_em_common.X.todense()),
}
def compute_correlations(dense):
programs = list(dense.keys())
cell_correlations = {}
for name_x in programs:
for name_y in programs[programs.index(name_x):]:
assert dense[name_x].shape == dense[name_y].shape
for cell_id in dense[name_x].columns:
cs_cors = replicate_scores(dense[name_x][cell_id], dense[name_y][cell_id], name_x, name_y)
cell_correlations.setdefault(name_x, {}).setdefault(name_y, {})[cell_id] = cs_cors
return cell_correlations
Just a warning... the all vs all comparison is slow.
tenx_correlations = compute_correlations(dense_10x)
len(tenx_correlations['cellranger']['solo'])
def plot_cell_correlation_histogram(table, metric, title=None, bins=50):
programs = list(table.keys())
cell_hists = {}
f = pyplot.figure(figsize=(7,7))
if title is not None:
f.suptitle(title.format(metric=metric))
plot_size = len(programs)-1
axes = f.subplots(plot_size, plot_size, sharex=True, sharey=True)
for x, name_x in enumerate(programs):
for y, name_y in enumerate(programs[programs.index(name_x)+1:]):
#plot_index = plot_size * (y+x) + x + 1
#ax = f.add_subplot(plot_size, plot_size, plot_index)
ax = axes[y+x, x]
if x == 0:
ax.set_ylabel(name_y)
spearman = []
for cell_id in table[name_x][name_y]:
spearman.append(table[name_x][name_y][cell_id][metric])
#spearman = numpy.asarray(spearman)
spearman = numpy.array(spearman)
spearman = spearman[~numpy.isnan(spearman)]
count = len(spearman)
median = numpy.median(spearman)
mean = numpy.mean(spearman)
cell_hists.setdefault(name_x, {})[name_y] = ax.hist(spearman, bins=bins, density=True)
ax.annotate(f'Mean {mean:0.2}\nMedian {median:0.2}\nCount {count}', xy=(0.1, 0.6), xycoords='axes fraction')
for y in range(plot_size):
axes[0, y].set_title(programs[y])
axes[plot_size-1, y].set_xlabel(programs[y])
#f.tight_layout()
def extract_score(correlations, metric, name_x, name_y):
score = {}
#name_x='cellranger'
#name_y='kallisto'
for cell_id in correlations[name_x][name_y]:
score[cell_id] = correlations[name_x][name_y][cell_id][metric]
return pandas.Series(score)
def sc_scatter(table, correlations, metric, name_x, name_y, cell_id, ax=None):
gridalpha = 0.5
def is_spike(x):
if x.startswith('gSpikein_') or x.startswith('tSpikein_'):
return True
else:
return False
if ax is None:
f = pyplot.figure()
ax = f.subplots(1,1)
set1 = cm.get_cmap('Set1').colors
colors = [{True: set1[0], False: set1[1]}[is_spike(x)] for x in table[name_x].index]
ax.plot([-5,10], [-5,10], c=set1[2])
ax.scatter(numpy.log2(table[name_x][cell_id]+0.01), numpy.log2(table[name_y][cell_id]+0.01), color=colors, s=2)
rafa_spearman = correlations[name_x][name_y][cell_id][metric]
passed_filter = correlations[name_x][name_y][cell_id]["passed_filter"]
ax.set_title(f'id {cell_id}\n{metric} {rafa_spearman:0.4}\nCount {int(passed_filter)}')
ax.set_xlabel(name_x)
ax.set_ylabel(name_y)
ax.grid(color='dimgrey', linestyle='-', linewidth=0.5, which="both", alpha = gridalpha)
def show_scatter_extremes(dense_mat, correlations, metric, name_x, name_y):
scores = extract_score(correlations, metric, name_x, name_y)
best_cell = scores.idxmax()
worst_cell = scores.idxmin()
# find smallest absolute difference from median and use that as median cell
median = numpy.abs(scores - scores.median())
median_cell = median.idxmin()
f = pyplot.figure(figsize=(12,4))
#f.suptitle(f'{metric} worst, median and best cells')
axes = f.subplots(1,3, sharex=True, sharey=True)
sc_scatter(dense_mat, correlations, metric, name_x, name_y, worst_cell, ax=axes[0])
sc_scatter(dense_mat, correlations, metric, name_x, name_y, median_cell, ax=axes[1])
sc_scatter(dense_mat, correlations, metric, name_x, name_y, best_cell, ax=axes[2])
I have several slides where I wanted to get a sense of how the different algorithms performed on a cell.
What I settled on is a panel of 3 scatter plots that show the cell with the lowest correlation, the median correlation, and the best correlation between two selected algorithms.
Data points are in blue, spike-in values are in red and the line y=x is in green.
For 10x data there were no spike ins added, so all the spikes should be at 0,0 for naive, or min_threshold = $log_2 0.01$. Once in a while there's a read spuriously maps to the spike ins and so there might be one non-zero spike in the 10x data.
These plots are derived from ENCSR874BOF our e10.5 mouse forelimb dataset. They were analyzed with indicies built from the full ENCODE Bulk RNA-seq GTF file. This is unusual, many other 10x comparisons are done with a much smaller set that focuses on protein coding, lincRNA and antisense genes as well as the TR and IG annotations. A quick inspection suggests processed transcripts and pseudogenes appear to be the largest categories removed. See compare-10x-vs-ENCODE-gtf for more details.
The 10x cells selected were the intersection of Cellranger, STAR Solo, and Alevin methods, which happens to be the cells selected by STAR Solo.
plot_cell_correlation_histogram(tenx_correlations, 'naive_spearman','Histogram of 10x per-cell {metric} correlations', bins=50)
show_scatter_extremes(dense_10x, tenx_correlations, 'naive_spearman', 'cellranger', 'solo')
show_scatter_extremes(dense_10x, tenx_correlations, 'naive_spearman', 'cellranger', 'alevin')
show_scatter_extremes(dense_10x, tenx_correlations, 'naive_spearman', 'cellranger', 'kallisto')
show_scatter_extremes(dense_10x, tenx_correlations, 'naive_spearman', 'solo', 'alevin')
show_scatter_extremes(dense_10x, tenx_correlations, 'naive_spearman', 'solo', 'kallisto')
show_scatter_extremes(dense_10x, tenx_correlations, 'naive_spearman', 'alevin', 'kallisto')
plot_cell_correlation_histogram(tenx_correlations, 'rafa_spearman','Histogram of 10x per-cell {metric} correlations', bins=50)
show_scatter_extremes(dense_10x, tenx_correlations, 'rafa_spearman', 'cellranger', 'solo')
show_scatter_extremes(dense_10x, tenx_correlations, 'rafa_spearman', 'cellranger', 'alevin')
show_scatter_extremes(dense_10x, tenx_correlations, 'rafa_spearman', 'cellranger', 'kallisto')
show_scatter_extremes(dense_10x, tenx_correlations, 'rafa_spearman', 'solo', 'alevin')
show_scatter_extremes(dense_10x, tenx_correlations, 'rafa_spearman', 'solo', 'kallisto')
show_scatter_extremes(dense_10x, tenx_correlations, 'rafa_spearman', 'alevin', 'kallisto')
suffix = '_gene_counts.h5ad'
c1_gene_counts = {}
for filename in sorted(Path('c1_pseudo/').glob('*'+suffix)):
name = filename.name[:-len(suffix)]
h5ad = scanpy.read_h5ad(filename)
c1_gene_counts[name] = h5ad.to_df().T
print(name, c1_gene_counts[name].shape)
c1_gene_count_correlations = compute_correlations(c1_gene_counts)
plot_cell_correlation_histogram(c1_gene_count_correlations, 'naive_spearman','Histogram of per-cell {metric} C1 count correlations')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'kallisto', 'star')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'rsem', 'star')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'salmon_decoy', 'salmon')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'salmon_decoy', 'star')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'naive_spearman', 'salmon', 'star')
plot_cell_correlation_histogram(c1_gene_count_correlations, 'rafa_spearman','Histogram of per-cell {metric} C1 count correlations')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'kallisto', 'star')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'rsem', 'star')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'salmon_decoy', 'salmon')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'salmon_decoy', 'star')
show_scatter_extremes(c1_gene_counts, c1_gene_count_correlations, 'rafa_spearman', 'salmon', 'star')
suffix = '_gene_tpms.h5ad'
c1_gene_tpms = {}
for filename in sorted(Path('c1_pseudo/').glob('*'+suffix)):
name = filename.name[:-len(suffix)]
h5ad = scanpy.read_h5ad(filename)
c1_gene_tpms[name] = h5ad.to_df().T
print(name, c1_gene_tpms[name].shape)
c1_gene_tpm_correlations = compute_correlations(c1_gene_tpms)
plot_cell_correlation_histogram(c1_gene_tpm_correlations, 'naive_spearman','Histogram of per-cell {metric} C1 gene TPM correlations', bins=50)
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'naive_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'naive_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'naive_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'naive_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'naive_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'naive_spearman', 'salmon_decoy', 'salmon')
plot_cell_correlation_histogram(c1_gene_tpm_correlations, 'rafa_spearman','Histogram of per-cell {metric} C1 gene TPM correlations', bins=50)
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'rafa_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'rafa_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'rafa_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'rafa_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'rafa_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_gene_tpms, c1_gene_tpm_correlations, 'rafa_spearman', 'salmon_decoy', 'salmon')
suffix = '_transcript_counts.h5ad'
c1_transcript_counts = {}
for filename in sorted(Path('c1_pseudo/').glob('*'+suffix)):
name = filename.name[:-len(suffix)]
h5ad = scanpy.read_h5ad(filename)
c1_transcript_counts[name] = h5ad.to_df().T
print(name, c1_transcript_counts[name].shape)
c1_transcript_count_correlations = compute_correlations(c1_transcript_counts)
plot_cell_correlation_histogram(c1_transcript_count_correlations, 'naive_spearman','Histogram of per-cell {metric} C1 transcript count correlations', bins=50)
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'naive_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'naive_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'naive_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'naive_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'naive_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'naive_spearman', 'salmon_decoy', 'salmon')
plot_cell_correlation_histogram(c1_transcript_count_correlations, 'rafa_spearman','Histogram of per-cell {metric} C1 transcript count correlations', bins=50)
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'rafa_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'rafa_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'rafa_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'rafa_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'rafa_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_transcript_counts, c1_transcript_count_correlations, 'rafa_spearman', 'salmon_decoy', 'salmon')
suffix = '_transcript_tpms.h5ad'
c1_transcript_tpms = {}
for filename in sorted(Path('c1_pseudo/').glob('*'+suffix)):
name = filename.name[:-len(suffix)]
h5ad = scanpy.read_h5ad(filename)
c1_transcript_tpms[name] = h5ad.to_df().T
print(name, c1_transcript_tpms[name].shape)
c1_transcript_tpm_correlations = compute_correlations(c1_transcript_tpms)
plot_cell_correlation_histogram(c1_transcript_tpm_correlations, 'naive_spearman','Histogram of per-cell {metric} C1 transcript TPM correlations', bins=50)
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'naive_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'naive_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'naive_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'naive_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'naive_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'naive_spearman', 'salmon_decoy', 'salmon')
plot_cell_correlation_histogram(c1_transcript_tpm_correlations, 'rafa_spearman','Histogram of per-cell {metric} C1 transcript TPM correlations', bins=50)
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'rafa_spearman', 'kallisto', 'rsem')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'rafa_spearman', 'kallisto', 'salmon_decoy')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'rafa_spearman', 'kallisto', 'salmon')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'rafa_spearman', 'rsem', 'salmon_decoy')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'rafa_spearman', 'rsem', 'salmon')
show_scatter_extremes(c1_transcript_tpms, c1_transcript_tpm_correlations, 'rafa_spearman', 'salmon_decoy', 'salmon')